#!/bin/bash
# torque.setup

# Copyright 2013-2014 Amazon.com, Inc. or its affiliates. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License"). You may not use this file except in compliance with the
# License. A copy of the License is located at
#
# http://aws.amazon.com/apache2.0/
#
# or in the "LICENSE.txt" file accompanying this file. This file is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES 
# OR CONDITIONS OF ANY KIND, express or implied. See the License for the specific language governing permissions and
# limitations under the License.

# USAGE:  torque.setup <USERNAME> [<HOSTNAME.DOMAIN>]

# The user name argument is mandatory: without it, print the usage line and
# stop with a non-zero status.
[ -n "$1" ] || {
  echo "USAGE:  torque.setup <USERNAME> [<HOSTNAME.DOMAIN>]"
  exit 1
}

# Make sure a trqauthd process exists before going any further. When none is
# found, launch it and abort the setup if the launch command reports failure.
# NOTE(review): PBSDEBUG is cleared first, presumably so the TORQUE programs
# started by this script do not pick up debug mode -- confirm.
unset PBSDEBUG
if pgrep trqauthd > /dev/null; then
  : # a trqauthd process already exists; nothing to start
elif trqauthd > /dev/null; then
  echo "trqauthd successfully started"
else
  echo "trqauthd failed to start!!! exiting setup"
  exit 1
fi

# Server host name: take the second argument when it is given and non-empty,
# otherwise fall back to the fully qualified name printed by `hostname -f`.
HOSTNAME=${2:-$(hostname -f)}

# The first argument is the account that is added to the TORQUE operators and
# managers lists later in this script.
USER=$1
echo "initializing TORQUE (admin: ${USER})"

# Refuse to continue while a pbs_server process is already present.
pgrep pbs_server > /dev/null && {
  echo "ERROR: pbs_server already running... run 'qterm' to stop pbs_server and rerun"
  exit 1
}

# Launch pbs_server with the `-f -t create` options.
pbs_server -f -t create
# Starting in TORQUE 3.1 the server is multi-threaded, so it needs a moment
# to finish coming up; talking to it through qmgr right away would fail.
sleep 2

# Abort when no pbs_server process can be found after the pause.
pgrep pbs_server > /dev/null || {
  echo "ERROR: pbs_server failed to start, check syslog and server logs for more information"
  exit 1
}

# Add the user, from any host, to the server operators list. If qmgr rejects
# the request, shut the server down again with qterm and abort.
if ! printf '%s\n' "set server operators += ${USER}@*" | qmgr > /dev/null; then
  echo "ERROR: cannot set TORQUE admins"
  qterm
  exit 1
fi

# Add the user at this host to the server managers list. The result of this
# call is not checked.
printf '%s\n' "set server managers += ${USER}@${HOSTNAME}" | qmgr


# Server-wide attributes, applied with one qmgr call each, in the listed
# order. The nodect value is filled in by the template engine when this file
# is rendered.
for server_attr in \
  'scheduling = true' \
  'keep_completed = 300' \
  'mom_job_sync = true' \
  'resources_available.nodect = <%= node['cfncluster']['cfn_max_queue_size'] %>' \
  'node_check_rate = 120' \
  'node_ping_rate = 60' \
  'timeout_for_job_delete = 30' \
  'timeout_for_job_requeue = 30'
do
  qmgr -c "set server ${server_attr}"
done

# Create the default queue, named "batch", and configure it with one qmgr
# call per attribute, in the listed order. The nodect value is filled in by
# the template engine when this file is rendered.
qmgr -c 'create queue batch'
for queue_attr in \
  'queue_type = execution' \
  'started = true' \
  'enabled = true' \
  'resources_default.walltime = 1:00:00' \
  'resources_default.nodes = 1' \
  'resources_available.nodect = <%= node['cfncluster']['cfn_max_queue_size'] %>'
do
  qmgr -c "set queue batch ${queue_attr}"
done

# Make "batch" the server's default queue.
qmgr -c 'set server default_queue = batch'

qmgr -c 'set server query_other_jobs = True'

# Register this host as a node with np=1000.
qmgr -c 'create node <%= node['hostname'] %> np=1000'

# The dummy node may only be switched to offline after the updated okclients
# list has been sent out to the cluster; at that point its state becomes free.
# Until then pbsnodes reports "MOM-list-not-sent", so poll once per second
# and give up after 240 polls (4 minutes).
waited=1
while pbsnodes <%= node['hostname'] %> | grep "state = .*MOM-list-not-sent" > /dev/null; do
  if (( waited > 240 )); then
    echo "ERROR: cannot set MasterServer to offline"
    exit 1
  fi
  (( waited += 1 ))
  sleep 1
done

# Mark the node offline and attach the note 'MasterServer' to it.
pbsnodes -o -N 'MasterServer' <%= node['hostname'] %>
